Data analysis of marketing questionnaire answers

11-05-2020

In [219]:
# To do:
# Clean the data:
#     - Remove rows with missing values
#     - Transform variables into numbers (yes / no -> 1 / 0)
# Inspect data:
#     - Did students answer(?)
#     - Choose according hypotheses (do we include families, working people, etc.) - can we even compare students with
#     other groups of people?
# Prepare for regression:
#     - Check for assumptions
# Run regression
# Check the quality of regression
# Make conclusions

Importing libraries

In [220]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

Importing data

In [221]:
df = pd.read_csv(r'C:\Users\Lukas\OneDrive\University\Year 3\Marketing\Project\Deliveroo_data_cleaned.csv')

Data cleaning

Variable names

Latitude
Longitude

Male (0 is female)
Age Are you currently a student? (1 is a yes, 2 is a no)
Do you currently live with other members of your family?

How many people do you live with?
Approximately how far away from the city centre (downtown) do you live? (Km)
How many times a month do you usually shop for groceries?
How many times a month do you forget a particular item and have to return to the supermarket?
How many times a month do you order something online?
How many times a month do you order restaurant food online? Do you order groceries online? *
If yes to the previous question, how many times a month?

Would you be interested in using this type of delivery service in general, given a fair price? *
What would be the MAXIMUM delivery fee you would be willing to pay for efficiently delivered grocery store items?

Removing rows with missing values

In [222]:
# Dropping NaN values that were not made by mistake (because of questionnaire design)
# - truly unanswered
df = df.dropna(subset=['Latitude', 'Longitude', 'Male', 'Age', 'is_student',
                       'live_with_family', 'order_groceries_online', 'fast_groceries_would_use'])
In [223]:
# Fill the NaN values with 0's (because it's a slider question)
# - NaN means they meant 0
df.fillna(value=0, inplace=True)
In [224]:
# Maximum fee column is different, 0 value would not be interpreted as won't use, but like 
# won't pay, therefore it is recoded into 'Wont use'
# - 0 is not a valid fee to pay
# df.loc[df['maximum_fee_for_fast_groceries_delivery'] == 0,
#        'maximum_fee_for_fast_groceries_delivery'] = 'Wont use'

Transform variables into numbers

In [225]:
# Replace 2 (which is a 'No') with 0
tr = ['Male', 'is_student', 'live_with_family', 
      'order_groceries_online', 'fast_groceries_would_use']

for column in tr:
    df.loc[df[column] == 2, column] = 0

Data inspection

General variable inspection

In [226]:
df.describe()
Out[226]:
Latitude Longitude Male Age is_student live_with_family amount_people_living_with distance_from_city_center shop_for_groceries_per_month forget_an_item_per_month order_something_online_per_month order_restaurant_food_online_per_month order_groceries_online order_groceries_online_per_month fast_groceries_would_use maximum_fee_for_fast_groceries_delivery
count 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.000000 131.00000
mean 51.695290 5.763974 0.412214 28.022901 0.694656 0.541985 3.427481 5.145038 8.832061 2.519084 2.656489 2.114504 0.091603 1.076336 0.526718 2.40458
std 2.052045 2.367082 0.494123 13.856665 0.462321 0.500147 2.270312 7.539763 5.607896 2.815756 2.306416 2.808343 0.289572 5.786747 0.501202 1.80503
min 30.077805 4.312897 0.000000 14.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000
25% 51.816048 5.024857 0.000000 21.000000 0.000000 0.000000 2.000000 1.000000 4.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.00000
50% 51.927094 5.604507 0.000000 22.000000 1.000000 1.000000 3.000000 3.000000 8.000000 2.000000 2.000000 1.000000 0.000000 0.000000 1.000000 2.00000
75% 52.050301 6.010895 1.000000 25.000000 1.000000 1.000000 5.000000 5.000000 12.000000 3.000000 4.000000 3.000000 0.000000 0.000000 1.000000 4.00000
max 52.542892 31.285202 1.000000 94.000000 1.000000 1.000000 10.000000 50.000000 30.000000 15.000000 12.000000 20.000000 1.000000 50.000000 1.000000 8.00000
In [227]:
plt.figure(figsize=(15, 15))
sns.pairplot(df[['amount_people_living_with',
       'distance_from_city_center', 'shop_for_groceries_per_month',
       'forget_an_item_per_month', 'order_something_online_per_month',
       'order_restaurant_food_online_per_month']])
Out[227]:
<seaborn.axisgrid.PairGrid at 0x251cdfa4f60>
<Figure size 1080x1080 with 0 Axes>
In [228]:
df['Male'].value_counts()
Out[228]:
0.0    77
1.0    54
Name: Male, dtype: int64

Nothing to be derived from gender distribution

In [229]:
df['Age'].value_counts().head(8)
Out[229]:
22.0    29
23.0    23
21.0    16
19.0    13
20.0    10
25.0     7
60.0     5
24.0     4
Name: Age, dtype: int64

Old people?.ipynb_checkpoints/

In [230]:
df['is_student'].value_counts()
Out[230]:
1.0    91
0.0    40
Name: is_student, dtype: int64

Good amount of students and non students

In [231]:
df['live_with_family'].value_counts()
Out[231]:
1.0    71
0.0    60
Name: live_with_family, dtype: int64
In [232]:
df['amount_people_living_with'].value_counts()
Out[232]:
3.0     31
4.0     21
2.0     20
5.0     19
1.0     16
0.0      9
10.0     6
6.0      5
7.0      2
8.0      1
9.0      1
Name: amount_people_living_with, dtype: int64
In [233]:
df['distance_from_city_center'].value_counts()
Out[233]:
2.0     23
1.0     21
0.0     19
3.0     16
5.0     15
4.0      5
15.0     5
20.0     4
10.0     4
7.0      4
8.0      3
9.0      2
50.0     2
12.0     2
6.0      2
14.0     1
11.0     1
25.0     1
13.0     1
Name: distance_from_city_center, dtype: int64
In [234]:
df['shop_for_groceries_per_month'].value_counts()
Out[234]:
4.0     17
15.0    16
6.0     15
10.0    14
12.0     9
8.0      8
9.0      8
5.0      7
3.0      6
7.0      6
2.0      5
20.0     4
0.0      3
1.0      3
25.0     2
14.0     2
11.0     2
16.0     1
18.0     1
30.0     1
23.0     1
Name: shop_for_groceries_per_month, dtype: int64
In [235]:
df['forget_an_item_per_month'].value_counts()
Out[235]:
1.0     34
2.0     28
0.0     23
3.0     18
4.0      9
5.0      6
6.0      4
8.0      3
15.0     3
10.0     2
7.0      1
Name: forget_an_item_per_month, dtype: int64
In [236]:
df['order_something_online_per_month'].value_counts()
Out[236]:
2.0     32
1.0     27
0.0     19
3.0     17
5.0     13
4.0     10
7.0      7
10.0     2
6.0      2
8.0      1
12.0     1
Name: order_something_online_per_month, dtype: int64
In [237]:
df['order_restaurant_food_online_per_month'].value_counts()
Out[237]:
0.0     40
1.0     31
2.0     20
3.0     16
4.0      8
5.0      8
10.0     5
8.0      2
20.0     1
Name: order_restaurant_food_online_per_month, dtype: int64
In [238]:
df['order_groceries_online'].value_counts()
Out[238]:
0.0    119
1.0     12
Name: order_groceries_online, dtype: int64
In [239]:
df['order_groceries_online_per_month'].value_counts()
Out[239]:
0.0     116
2.0       4
4.0       3
5.0       2
50.0      1
3.0       1
1.0       1
6.0       1
9.0       1
42.0      1
Name: order_groceries_online_per_month, dtype: int64
In [240]:
df['fast_groceries_would_use'].value_counts()
Out[240]:
1.0    69
0.0    62
Name: fast_groceries_would_use, dtype: int64

Choosing a hypothesis

Students are more likely to choose

Preparing for regression

Checking assumptions (the ones that can be checked before running it)


Dependent variable to be binary
Binary logistic regression requires the dependent variable to be binary and ordinal logistic regression requires the dependent variable to be ordinal. And a value of 1 should represent the desired outcome.

Independent observations
Logistic regression requires the observations to be independent of each other. In other words, the observations should not come from repeated measurements or matched data.

No multicollinearity
Logistic regression requires there to be little or no multicollinearity among the independent variables. This means that the independent variables should not be too highly correlated with each other.

Linearity
Logistic regression assumes linearity of independent variables and log odds. although this analysis does not require the dependent and independent variables to be related linearly, it requires that the independent variables are linearly related to the log odds.

Large sample size
Logistic regression typically requires a large sample size. A general guideline is that you need at minimum of 10 cases with the least frequent outcome for each independent variable in your model. For example, if you have 5 independent variables and the expected probability of your least frequent outcome is .10, then you would need a minimum sample size of 500 (10*5 / .10)

Meaningful variables
Only the meaningful variables should be included.

Lack of outliers


Dependent variable is categorical (and binary) ++
Independent observations ++
Multicollinearity ++
In [241]:
# Creating temporary df without Latitude and Longtitude
temp = df.drop(columns=['Latitude', 'Longitude'])
In [242]:
def var_corr(varlist, savefig=False, dpi=300):
    '''
    Takes a list of independent variables, creates an image of
    a heatmap plot of the correlation values.
    '''
    # Creating a correlation table
    corr = temp.corr()

    # Creating a mask for the heatmap (removes upper part of the triangle - redundant information)
    mask = np.zeros_like(corr, dtype=np.bool)  # Copy the df shape and fill it with 0's
    mask[np.triu_indices_from(mask)] = True  # Select upper triangle of array

    # Creating a heatmap
    plt.figure(figsize=(10, 10))
    ax = sns.heatmap(
        corr,
        vmin=-1, vmax=1, center=0,
        cbar_kws={"ticks": [-1, -0.5, 0, 0.5, 1]},  # setting colorbar tick values
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
        annot=True,
        mask=mask)

    # Tweaking X-axis labels
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right')

    # Tweaking Y-axis labels
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0)
    
    # Saving the figure
    if savefig == True:
        plt.savefig('Correlation heatmap.png', dpi=dpi)
        
var_corr(temp)
In [243]:
# Import function
def sm_vif(x, savefig=False, dpi=300):
    '''
    Takes independent variables, calculates their VIF for
    multicorrelation, shows the value table and generates an image.
    '''
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # Get variables for which to compute VIF and add intercept term
    x = x
    x['Intercept'] = 1  # Using statsmodels VIF calculation requires adding
                        # the intercept column, because it is not done so automatically, and 
                        # otherwise will result in incorrect VIF values.

    # Compute VIF in a new dataframe and view it
    vif = pd.DataFrame()
    vif["variables"] = x.columns
    vif["VIF"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]

    # Dropping the intercept row
    vif = vif[vif.variables != 'Intercept']

    # View results using print
    print(vif)

    # Setting figure size
    plt.figure(figsize=(8, 4))

    # Plotting a horizontal bar chart
    b = plt.barh(y=vif['variables'], width=vif['VIF'],
            height=0.5, color='darkorange')

    # adjusting
    plt.title('VIF values', fontsize=20)
    plt.xlabel('Value', fontsize=15)
    plt.ylabel('Variable', fontsize=15)

    # Removing top and right part of the frame
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    # Finding y-line height according to how many variables there are
    yline = vif.variables.count()
    # draw vertical line from (70, 100) < x ,x coordinates  to (70, 250) < y, y coordinates
    plt.plot([5, 5], [-0.4, yline], '--', lw=2, color='darkred')

    # Setting x value limits (plt.xlim used instead of b.set_xlim because of barh type of chart)
    plt.xlim([0, 5.5])

    # Saving the figure
    if savefig == True:
        plt.savefig('VIF values graph.png', dpi=dpi)
In [244]:
sm_vif(temp)
                                  variables       VIF
0                                      Male  1.196223
1                                       Age  2.295052
2                                is_student  2.127513
3                          live_with_family  1.395654
4                 amount_people_living_with  1.229018
5                 distance_from_city_center  1.354839
6              shop_for_groceries_per_month  1.376155
7                  forget_an_item_per_month  1.164744
8          order_something_online_per_month  1.384974
9    order_restaurant_food_online_per_month  1.329407
10                   order_groceries_online  1.141153
11         order_groceries_online_per_month  1.171545
12                 fast_groceries_would_use  1.472034
13  maximum_fee_for_fast_groceries_delivery  1.441500
Outliers ++
In [245]:
def find_del_outliers(data, df):
    '''
    Finds, deletes and plots outliers.
    '''
    
    
    
    def find_outliers(data):
        '''
        Accept an array from a dataframe and return a list of outliers.
        '''
        # Set upper and lower limit to 3 standard deviation
        data_std = data.std()
        data_mean = data.mean()
        outlier_cut_off = data_std * 3

        lower_limit  = data_mean - outlier_cut_off 
        upper_limit = data_mean + outlier_cut_off
        print(f'Upper limit: {upper_limit}')
        print(f'lower limit: {lower_limit}')

        # Populating a list with outliers
        outliers = []
        for an_outlier in data:
            if an_outlier > upper_limit or an_outlier < lower_limit:
                outliers.append(an_outlier)
        print(f'Outliers: {outliers}')
        
        
        
        
        def remove_outliers_array(data):
            '''
            Accept an array from a dataframe, remove outliers, return cleaned data as a new array.
            '''
            cleaned_array = data[(data > lower_limit) & (data < upper_limit)]

            return cleaned_array
        
        
        
        def drop_outliers_df(data, df):
            '''
            Accept an array from a dataframe and the dataframe, remove outliers, return cleaned data as a new dataframe.
            '''
            cleaned_df = df[(data > lower_limit) & (data < upper_limit)]

            return cleaned_df
        
        
        # Calling inner functions
        cleaned_array = remove_outliers_array(data)
        cleaned_df = drop_outliers_df(data, df)
        
        return outliers, cleaned_array, cleaned_df  # Has to contain everything that the final return contains
    
    
    # Returning results of both functions
    # Calling the function
    outliers, cleaned_array, cleaned_df = find_outliers(data)
    
    return outliers, cleaned_array, cleaned_df


found_outliers, cleaned_array, cleaned_df = find_del_outliers(df['forget_an_item_per_month'], df)
Upper limit: 10.966350532674136
lower limit: -5.928182593742838
Outliers: [15.0, 15.0, 15.0]
In [246]:
# Box plot outlier visualization
# sns.boxplot(data= df[['gpa', 'rank']]).set_title("GPA and Rank Box Plot")  # Example
In [247]:
# Continuous var columns
continuous_columns = ['amount_people_living_with',
       'distance_from_city_center', 'shop_for_groceries_per_month',
       'forget_an_item_per_month', 'order_something_online_per_month',
       'order_restaurant_food_online_per_month',
       'order_groceries_online_per_month',
       'maximum_fee_for_fast_groceries_delivery']

def print_outliers(continuous_columns):
    '''
    Prints column name, upper and lower limit (3 S.D.) and a list of outliers. 
    (Dependancy on find_del_outliers.)
    '''
    for column in cont_columns:
        print(column)
        found_outliers = find_del_outliers(df[column], df)
        print('\n')
        
print_outliers(continuous_columns)
amount_people_living_with
Upper limit: 10.23841647715948
lower limit: -3.3834546450984124
Outliers: []


distance_from_city_center
Upper limit: 27.764328518175965
lower limit: -17.474252182298102
Outliers: [50.0, 50.0]


shop_for_groceries_per_month
Upper limit: 25.65575043166789
lower limit: -7.991628294263313
Outliers: [30.0]


forget_an_item_per_month
Upper limit: 10.966350532674136
lower limit: -5.928182593742838
Outliers: [15.0, 15.0, 15.0]


order_something_online_per_month
Upper limit: 9.575735875623241
lower limit: -4.262758776386601
Outliers: [12.0, 10.0, 10.0]


order_restaurant_food_online_per_month
Upper limit: 10.539531543172753
lower limit: -6.310523909584968
Outliers: [20.0]


order_groceries_online_per_month
Upper limit: 18.436575545428447
lower limit: -16.283903789703253
Outliers: [42.0, 50.0]


maximum_fee_for_fast_groceries_delivery
Upper limit: 7.819669659123978
lower limit: -3.0105093537804666
Outliers: [8.0]


Linearity

Logistic regression does not require the continuous IV(s) to be linearly related to the DV. It does require the continuous IV(s) be linearly related to the log odds of the IV though. A way to test this is to plot the IV(s) in question and look for an S-shaped curve. Sometimes the S-shape will not be obvious. The plot should have a flat or flat-ish top and bottom with an increase or decreasing middle.

In [248]:
def linearity_log_odds_vs_indep_vars(df, dep_var_string, string_list_of_ind_vars):
    '''
    Takes df name, dependent var name and a list of independent variable names and 
    visually exames the linearity between independent variables and dependent log odds.
    '''
    plt.figure(figsize=(10, 10))
    
#     fig, axs = plt.subplots(ncols=int(len(string_list_of_ind_vars)/2), nrows=int(len(string_list_of_ind_vars)/2))
    
    counter = 0
    for variable in string_list_of_ind_vars:
        plt.figure(variable)
        sns.regplot(variable, dep_var_string, data=df, logistic=True).set_title(variable)
        counter += 1
        
#     sns.regplot(df[string_list_of_ind_vars[1]], df[dep_var_string], logistic=False).set_title(string_list_of_ind_vars[1])


# Visually checking if the linearity is present
continuous_columns = ['amount_people_living_with',
       'distance_from_city_center', 'shop_for_groceries_per_month',
       'forget_an_item_per_month', 'order_something_online_per_month',
       'order_restaurant_food_online_per_month',
       'order_groceries_online_per_month',
       'maximum_fee_for_fast_groceries_delivery']
        
linearity_log_odds_vs_indep_vars(df, 'fast_groceries_would_use', continuous_columns)
C:\Users\Lukas\Anaconda3\lib\site-packages\statsmodels\genmod\families\family.py:894: RuntimeWarning:

invalid value encountered in true_divide

<Figure size 720x720 with 0 Axes>
Large enough sample size
In [249]:
def logistic_reg_sample(df, dependent_var_column_name, independent_var_count):
    '''
    Takes the dependent variable column (finds the ratio between 0 and 1 values of the dependent variable) and independent
    variable count, and returns & prints the minimal sample size.
    '''
    # Finding the minimum and maximum value counts, or how many observations of each category of the bivariate variable there are.
    minority_count = df[dependent_var_column_name].value_counts().min()
    majority_count = df[dependent_var_column_name].value_counts().max()
    
    # Calculating the ratio
    total_count = minority_count + majority_count
    ratio = minority_count / total_count
    
    # Calculating the minimum sample size
    min_sample = (10 * independent_var_count) / ratio
    
    # Calculating if the current sample size large enough
    if total_count >= min_sample:
        print('Sample size large enough. PASS\n')
    else:
        print('Sample size not large enough. FAIL\n')
    
    # Printing out the results
    print('The closer the ratio of the dependent variable categories is to 50/50, the less observations are needed.')
    print('The larger the amount of independent variables, the more observations are needed.\n')
    print(f'Minimal recommended sample size is: {int(min_sample)}')

logistic_reg_sample(df, 'fast_groceries_would_use', 12)
Sample size not large enough. FAIL

The closer the ratio of the dependent variable categories is to 50/50, the less observations are needed.
The larger the amount of independent variables, the more observations are needed.

Minimal recommended sample size is: 253
Meaningful variables
In [250]:
R = np.eye(len(logistic_model.params))
logistic_model.wald_test(R)
print(R)
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]

Running the regression

In [251]:
# Regression variables

# Original list
# X = df[['Male', 'Age', 'is_student',
#        'live_with_family', 'amount_people_living_with',
#        'distance_from_city_center', 'shop_for_groceries_per_month',
#        'forget_an_item_per_month', 'order_something_online_per_month',
#        'order_restaurant_food_online_per_month', 'order_groceries_online',
#        'order_groceries_online_per_month', 'fast_groceries_would_use',
#        'maximum_fee_for_fast_groceries_delivery']]

X = df[['Male', 'Age', 'is_student',
       'live_with_family', 'amount_people_living_with',
       'distance_from_city_center', 'shop_for_groceries_per_month',
       'forget_an_item_per_month', 'order_something_online_per_month',
       'order_restaurant_food_online_per_month', 'order_groceries_online',
       'order_groceries_online_per_month']]

Y = df['fast_groceries_would_use']

# Running the regression
X = sm.add_constant(X)  # add an intercept (beta_0) to our model (which is 1.0)
logistic_model = sm.Logit(Y, X).fit(cov_type="HC1")

# Print out the regression output
# print(logistic_model.summary())  # Expanded summary
print(logistic_model.summary2())  # Brief summary
Optimization terminated successfully.
         Current function value: 0.577583
         Iterations 7
                                   Results: Logit
=====================================================================================
Model:                   Logit                        Pseudo R-squared:     0.165    
Dependent Variable:      fast_groceries_would_use     AIC:                  177.3268 
Date:                    2020-05-19 21:37             BIC:                  214.7044 
No. Observations:        131                          Log-Likelihood:       -75.663  
Df Model:                12                           LL-Null:              -90.615  
Df Residuals:            118                          LLR p-value:          0.0028873
Converged:               1.0000                       Scale:                1.0000   
No. Iterations:          7.0000                                                      
-------------------------------------------------------------------------------------
                                        Coef.  Std.Err.    z    P>|z|   [0.025 0.975]
-------------------------------------------------------------------------------------
const                                   1.3415   1.4824  0.9050 0.3655 -1.5640 4.2470
Male                                   -0.6836   0.4347 -1.5725 0.1158 -1.5356 0.1684
Age                                    -0.0392   0.0276 -1.4196 0.1557 -0.0934 0.0149
is_student                             -0.1720   0.6467 -0.2659 0.7903 -1.4395 1.0956
live_with_family                        0.2205   0.4329  0.5093 0.6105 -0.6280 1.0691
amount_people_living_with               0.0604   0.0923  0.6546 0.5127 -0.1205 0.2414
distance_from_city_center               0.0219   0.0300  0.7295 0.4657 -0.0369 0.0807
shop_for_groceries_per_month           -0.0883   0.0537 -1.6443 0.1001 -0.1934 0.0169
forget_an_item_per_month                0.1089   0.0739  1.4729 0.1408 -0.0360 0.2538
order_something_online_per_month       -0.1823   0.1196 -1.5243 0.1274 -0.4167 0.0521
order_restaurant_food_online_per_month  0.3636   0.1328  2.7371 0.0062  0.1032 0.6239
order_groceries_online                  1.6688   0.8271  2.0177 0.0436  0.0477 3.2898
order_groceries_online_per_month       -0.0292   0.0269 -1.0870 0.2770 -0.0819 0.0235
=====================================================================================

Checking the quality of the regression

In [252]:
# Unofficial quality check criteria

# Finding the predicted values
predicted_raw = logistic_model.predict()

# If predicted value is > 0.5 then the prediction is turned to 1 (or 'Yes'), and vice versa
predicted = []
for value in predicted_raw:
    
    if value > 0.5:
        predicted.append(1)
        
    elif value < 0.5:
        predicted.append(0)

# Assigning predicted values to the data frame
df['predicted'] = predicted

# Making a column with 'True' if the prediction is correct and 'False' if it's wrong
df['is_true'] = df['fast_groceries_would_use'] == df['predicted']

# Displaying the results
df['is_true'].head(5)

# Counting the amount of correct and incorrect guesses
acc = df[['predicted', 'is_true']].groupby('is_true').count()
print(acc)

# Confusion matrix
cm = logistic_model.pred_table()
print('\nConfusion matrix: \n', cm)

# Calculating the accuracy of the model
accuracy = (acc.iloc[1, 0] / (acc.iloc[0, 0] + acc.iloc[1, 0]))*100
print(f'\nAccuracy: {accuracy:0.1f} %')
         predicted
is_true           
False           36
True            95

Confusion matrix: 
 [[43. 19.]
 [17. 52.]]

Accuracy: 72.5 %

Interpreting results and making conclusions

Interpreting the log odds is not very straight forward when thinking about it’s effects. An easier way to interpret the findings is by converting the coefficients of the logistic regression model into odd ratios. This can be done by getting the exponent of the coefficient value.

In [253]:
# GETTING THE ODDS RATIOS, Z-VALUE, AND 95% CI
model_odds = pd.DataFrame(np.exp(logistic_model.params), columns= ['Odds ratio'])  # 1 is that it is going to happen(?)
model_odds['z-value']= logistic_model.pvalues
model_odds[['2.5%', '97.5%']] = np.exp(logistic_model.conf_int())
model_odds
Out[253]:
Odds ratio z-value 2.5% 97.5%
const 3.824858 0.365488 0.209307 69.895237
Male 0.504808 0.115838 0.215328 1.183458
Age 0.961540 0.155714 0.910860 1.015039
is_student 0.841998 0.790298 0.237042 2.990865
live_with_family 1.246725 0.610509 0.533637 2.912698
amount_people_living_with 1.062293 0.512704 0.886481 1.272972
distance_from_city_center 1.022120 0.465712 0.963767 1.084007
shop_for_groceries_per_month 0.915531 0.100106 0.824118 1.017084
forget_an_item_per_month 1.115020 0.140788 0.964635 1.288850
order_something_online_per_month 0.833348 0.127431 0.659212 1.053484
order_restaurant_food_online_per_month 1.438451 0.006199 1.108741 1.866208
order_groceries_online 5.305698 0.043624 1.048902 26.838003
order_groceries_online_per_month 0.971217 0.277024 0.921398 1.023730

Additional geographic mapping

In [254]:
# Finding edges of the map (adding or subtracting 2, to zoom out from the area which has values, 
# to get a more bird-eye-view picture)
zoom = 0.25

lat_min = 50.988007 - zoom
lat_max = df['Latitude'].max() + zoom  # Removed because of an outlier

lon_min = df['Longitude'].min() - zoom
lon_max = 6.816895 + zoom  # Removed because of an outlier + 2

print(lat_min, lat_max, lon_min, lon_max)

df['Latitude'].nsmallest(5)  # Returns a series object with a sorted list of lowest values (nsmallest / nlargest)
50.738007 52.79289246 4.062896728999999 7.066895
Out[254]:
12    30.077805
79    43.708496
24    50.988007
21    51.141205
40    51.241104
Name: Latitude, dtype: float64
In [255]:
# Removing coordinate outliers
df_new = remove_outlier(df, 'Latitude')
51.464668270000004
52.40167998999999
In [256]:
def scatter_on_map(lat, lon, savefig=False, dpi=300, figsize=(15, 8),
                  lon_min=0, lon_max=0, lat_min=0, lat_max=0):
    '''
    Plot a map and scatter points on top of it, generate an image.
    '''
    import cartopy.crs as ccrs
    import cartopy.feature as cfeature

    lat = lat
    lon = lon

    ##### Setting figure size
    plt.figure(figsize=figsize)

    # Creating a projection
    ax = plt.axes(projection=ccrs.PlateCarree())

    # Set background image (Check https://visibleearth.nasa.gov/collection/1484/blue-marble for more background images)
    # ax.stock_img()  # Setting background to stock image

    # Zooming in on a specified area
    if lon_min != 0:
        ax.set_extent([lon_min, lon_max, lat_min, lat_max])

    # Removing image size limits
    from PIL import Image
    Image.MAX_IMAGE_PIXELS = None
    # Setting up the environment directory
    import os  # Kernel needs to be reset if changes to image.json are made
    os.environ["CARTOPY_USER_BACKGROUNDS"] = r'C:\Users\Lukas\OneDrive\Projects\WB\Articles\Data Analysis\Geographical data\Meteor_data_visualization-master\cartopy_metadata'
    # Setting the image as a background
    ax.background_img(name='BM', resolution='high')

    # Removing everything but the scatter plot
    plt.axis('off')

    # Creating a scatter plot
    plt.scatter(lon, lat, s=90, alpha=0.6,
                  marker='.', color='red', edgecolors='white', linewidths=0.5,
                    transform=ccrs.PlateCarree())

    # Saving the figure
    if savefig == True:
        plt.savefig('Scatter on map.png', bbox_inches='tight', dpi=dpi)
        
# scatter_on_map(lat, lon, lon_min=lon_min, lon_max=lon_max, lat_min=lat_min, lat_max=lat_max)
In [257]:
# Import the necessaries libraries
import plotly.offline as pyo
import plotly.graph_objs as go
# Set notebook mode to work in offline
pyo.init_notebook_mode()
# Plotting the respondant map
def plotly_map_scatter(lon, lat, text, title, market_color='red', geo_scope='europe'):
    '''
    Display an interactive plotly map with country vectors and an overlaid scatter plot.
    '''
    fig = go.Figure(data=go.Scattergeo(
            lon = lon,
            lat = lat,
            text = text,
            mode = 'markers',
            marker_color = 'red',
            ))

    fig.update_layout(
            title = title,
            geo_scope= geo_scope,
            autosize=False,
            width=1000,
            height=1000,
            margin=dict(
                l=50,
                r=50,
                b=100,
                t=100,
                pad=4
        ))

    fig.show()

lon = df['Longitude']
lat = df['Latitude']
text = df['Male']
title = 'Survey respondant location'
geo_scope='europe'
    
plotly_map_scatter(lon, lat, text, title)
In [ ]: